# Set the working directory; the relative CSV path read later in this script
# is resolved against it.
# NOTE(review): this absolute path is machine-specific, and setwd() raises an
# error when the directory does not exist.
setwd("/Users/blessnimi/Desktop/78thUNGA")
library(udpipe)
library (tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lattice)
library(stringi)
library(forcats)
library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
##
## meta, meta<-
##
## Attaching package: 'tm'
## The following object is masked from 'package:quanteda':
##
## stopwords
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(readxl)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Make sure udpipe and wordcloud are installed, then attach both.
# requireNamespace() only checks availability, so install.packages() runs just
# for a missing package; library() then attaches it and stops with an error if
# that still fails (require() would only return FALSE and carry on).
for (pkg in c("udpipe", "wordcloud")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
## Loading required package: wordcloud
## Loading required package: RColorBrewer
# Load the two inputs: the cleaned corpus (CSV) and the speeches workbook
# (Excel). Both paths are relative to the working directory set by setwd() at
# the top of the script, so the data directory is spelled out only once.
db <- read.csv("UNGA_78_clean_corpus.csv")
speeches <- read_excel("78UNGA(2).xlsx")
# Lexicons used below: the "smart" stop-word list and the "bing" sentiment
# lexicon.
stp_wrds <- get_stopwords(source = "smart")
bing <- get_sentiments(lexicon = "bing")

# One row per word of the Text column, with stop words dropped.
db_new <- db %>%
  unnest_tokens(output = word, input = Text) %>%
  anti_join(stp_wrds, by = "word")

# Keep only the words present in the bing lexicon and count them per
# country code and sentiment.
db_bing <- db_new %>%
  inner_join(bing, by = "word") %>%
  count(Country_Code, sentiment)
# Download (if not already present) and load the English udpipe model.
model_info <- udpipe_download_model(language = "english", model_dir = "path/to/model", overwrite = FALSE)
# Stop early with a clear message instead of failing inside
# udpipe_load_model() when the download did not succeed.
if (isTRUE(model_info$download_failed)) {
  stop("udpipe model download failed: ", model_info$download_message, call. = FALSE)
}
ud_model <- udpipe_load_model(model_info$file_model)

# Give every row of db an explicit document id. seq_len() is used instead of
# 1:nrow(db) so that an empty data frame yields zero ids rather than c(1, 0).
db$doc_id <- paste0("doc", seq_len(nrow(db)))

# Tokenise and annotate the cleaned text, passing the ids explicitly so the
# join key below does not depend on udpipe's default document naming.
annotated_texts <- udpipe_annotate(ud_model, x = db$Cleaned_Text, doc_id = db$doc_id)

# One row per annotated token, including the part-of-speech tags.
pos_tags <- as.data.frame(annotated_texts)

# Attach the document-level columns of db to every token row.
merged_df <- left_join(db, pos_tags, by = "doc_id")
# Lookup table from two-letter country code to display name.
country_mapping <- c(
  "ao" = "Angola",
  "bw" = "Botswana",
  "cv" = "Cape Verde",
  "eg" = "Egypt",
  "er" = "Eritrea",
  "et" = "Ethiopia",
  "gm" = "Gambia",
  "gh" = "Ghana",
  "gw" = "Guinea-Bissau",
  "ke" = "Kenya",
  "lr" = "Liberia",
  "ls" = "Lesotho",
  "mu" = "Mauritius",
  "mw" = "Malawi",
  "mz" = "Mozambique",
  "na" = "Namibia",
  "ng" = "Nigeria",
  "rw" = "Rwanda",
  "sl" = "Sierra Leone",
  "sc" = "Seychelles",
  "ss" = "South Sudan",
  "sz" = "Swaziland",
  "tz" = "Tanzania",
  "ug" = "Uganda",
  "za" = "South Africa",
  "zw" = "Zimbabwe"
)
# Add a Country_Name column to merged_df by looking up each Country_Code.
# as.character() keeps the lookup name-based even if Country_Code is a factor
# (a factor would index by its integer codes), and unname() stops the codes
# from being carried along as element names. Codes missing from the mapping
# yield NA.
merged_df$Country_Name <- unname(country_mapping[as.character(merged_df$Country_Code)])
The process begins by loading the cleaned speech data. The data were cleaned, stemmed, lemmatized, and tagged with Universal Part-of-Speech (UPOS) categories using the udpipe R package.
# Split each speech's cleaned text into one row per word.
word_data <- speeches %>%
  unnest_tokens(word, Cleaned_Text)
# Per-country vocabulary size and total word count.
# count(Country_code, word) leaves one row per distinct word of a country, so
# n() is the number of distinct words (this column was previously mislabelled
# `Sentences`) and sum(n) is the total number of words.
word_count_per_country <- word_data %>%
  count(Country_code, word) %>%
  group_by(Country_code) %>%
  summarise(Unique_Words = n(), Words = sum(n))
# Print every row, however many countries the workbook contains.
print(word_count_per_country, n = Inf)
## # A tibble: 26 × 3
## Country_code Sentences Words
## <chr> <int> <int>
## 1 ANGOLA 855 1465
## 2 BOTSWANA 593 888
## 3 CAPE VERDE 463 777
## 4 EGYPT 844 1419
## 5 ERITEA 368 459
## 6 ESWATINI' 695 1150
## 7 ETHIOPIA 614 912
## 8 GAMBIA 760 1336
## 9 GHANA 642 1033
## 10 GUINEA-BISSAU 336 512
## 11 KENYA 902 1444
## 12 LESOTHO 674 1050
## 13 LIBERIA 514 869
## 14 MALAWI 458 775
## 15 MAURITIUS 784 1225
## 16 MOZAMBIQUE 738 1178
## 17 NAMIBIA 582 939
## 18 NIGERIA 624 902
## 19 RWANDA 352 457
## 20 SEYCHELLES' 565 936
## 21 SIERRA LEONE 971 1651
## 22 SOUTH AFRICA 545 1029
## 23 SOUTH SUDAN 366 510
## 24 TANZANIA 608 931
## 25 UGANDA 607 1086
## 26 ZIMBABWE' 520 794
Based on the statistics in Table 1, Sierra Leone recorded the highest sentence and word counts in its speech, followed by Angola, Kenya, Egypt, Gambia, Mauritius, and Mozambique. Rwanda, Eritrea, South Sudan, and Guinea-Bissau recorded the lowest sentence and word counts, as shown in the table.
For a comprehensive list of part-of-speech (POS) tags and their definitions, please refer to the Universal Dependencies documentation on universal POS tags (https://universaldependencies.org/u/pos/). The code below examines the distribution of each category independently.
# Token count and number of nouns, verbs, adjectives and adverbs per country.
# na.rm = TRUE keeps a count at its true value when some rows have no upos tag
# (e.g. a document that produced no tokens in the join); without it a single
# NA turns the whole sum into NA.
pos_counts <- merged_df %>%
  group_by(Country_Name) %>%
  summarise(
    SpeechLength = n(),
    NOUN = sum(upos == "NOUN", na.rm = TRUE),
    VERB = sum(upos == "VERB", na.rm = TRUE),
    ADJ = sum(upos == "ADJ", na.rm = TRUE),
    ADV = sum(upos == "ADV", na.rm = TRUE)
  )
# Reshape to long form for plotting: one row per country and part of speech.
# pivot_longer() replaces the superseded gather(); the resulting columns
# (Country_Name, SpeechLength, PartOfSpeech, Count) are the same.
pos_counts_long <- tidyr::pivot_longer(
  pos_counts,
  cols = c(NOUN, VERB, ADJ, ADV),
  names_to = "PartOfSpeech",
  values_to = "Count"
)
# Horizontal bar chart of the part-of-speech counts, one panel per country.
# The axis labels follow the aesthetics: Count is mapped to x and PartOfSpeech
# to y (the labels were previously swapped).
ggplot(pos_counts_long, aes(x = Count, y = PartOfSpeech, fill = PartOfSpeech)) +
  geom_col() +
  facet_wrap(~Country_Name, scales = "free") +
  labs(
    title = "Part of Speech Distribution by Country",
    x = "Count",
    y = "Part of Speech"
  ) +
  theme_minimal() +
  guides(fill = "none") # Remove legend (fill = FALSE is deprecated)
Figure 1.1 shows that Sierra Leone delivered the longest speech, surpassing all other countries in this regard. Figure 1.1 also illustrates the frequency distribution of each UPOS type: most of the speeches consist primarily of nouns, adverbs, verbs, and adjectives.
# Return the `k` most frequent lemmas of one UPOS class for every country.
#   df       token-level data frame with Country_Name, upos and lemma columns
#   upos_tag UPOS tag to keep, e.g. "NOUN"
#   k        number of top lemmas per country (ties at the cut-off are kept)
# The result has the columns Country_Name, term and n.
top_terms_by_country <- function(df, upos_tag, k) {
  df %>%
    filter(upos == !!upos_tag) %>%
    count(Country_Name, term = lemma, sort = TRUE) %>%
    group_by(Country_Name) %>%
    slice_max(n, n = k) %>%
    ungroup()
}

# Horizontal bar chart of term counts with one panel per country.
#   counts  output of top_terms_by_country()
#   title   plot title
#   y_label label for the term axis
# reorder_within() / scale_y_reordered() sort the bars inside each panel;
# a plain fct_reorder() would impose one global order on all panels.
plot_top_terms <- function(counts, title, y_label) {
  ggplot(counts, aes(x = n, y = reorder_within(term, n, Country_Name), fill = term)) +
    geom_col() +
    geom_text(aes(label = n), hjust = -0.2, size = 3, color = "black") + # Count label at the end of each bar
    scale_y_reordered() +
    facet_wrap(~ Country_Name, scales = "free_y") +
    labs(title = title, x = "Count", y = y_label) +
    theme_minimal() +
    guides(fill = "none") # Remove legend (fill = FALSE is deprecated)
}

# Top 6 nouns per country (change the last argument for a different number)
noun_counts <- top_terms_by_country(merged_df, "NOUN", 6)
plot_top_terms(noun_counts, "Top Nouns by Country", "Noun") +
  theme(axis.text.y = element_text(size = 10, margin = margin(0, 0, 0, 0))) # Adjust y-axis label size and margin

# Top 6 verbs per country
verb_counts <- top_terms_by_country(merged_df, "VERB", 6)
plot_top_terms(verb_counts, "Top Verbs by Country", "Verb")

# Top 4 adjectives per country
adjective_counts <- top_terms_by_country(merged_df, "ADJ", 4)
plot_top_terms(adjective_counts, "Top Adjectives by Country", "Adjective")

# Top 4 adverbs per country
adverb_counts <- top_terms_by_country(merged_df, "ADV", 4)
plot_top_terms(adverb_counts, "Top Adverbs by Country", "Adverb")
It is evident that Sierra Leone delivered the longest speech, surpassing all other countries in this regard. This section also illustrates the frequency distribution of each UPOS type. Most of the speeches consist primarily of nouns, adjectives, verbs, and adverbs. Based on the figures above, most national speeches exhibit a higher frequency of nouns and adjectives, followed by verbs, and finally adverbs. For a comprehensive list of part-of-speech (POS) tags and their definitions, please refer to the Universal Dependencies documentation on universal POS tags (https://universaldependencies.org/u/pos/).
The initial segment pertains to the frequency of words within each statement. This procedure calculates the words that exhibit the highest degree of exclusivity in a specific speech. This metric quantifies the degree of specificity exhibited by individual speeches in terms of their respective vocabularies. The second portion will incorporate a word cloud that encompasses all the aggregated statements, serving as an additional visual representation of the frequency of words used. The frequency of a word in the text is shown by its size.
# Ten most frequent lemmas per country (ties at the cut-off are kept).
# count() replaces group_by() + summarise(n()), so no grouping message is
# emitted, and the result is explicitly ungrouped.
# NOTE(review): every token row is counted, whatever its upos tag; confirm
# whether punctuation or function words should be filtered out first.
word_freq <- merged_df %>%
  count(Country_Name, lemma, name = "freq") %>%
  group_by(Country_Name) %>%
  slice_max(freq, n = 10) %>%
  ungroup()
# Bar chart with one panel per country. reorder_within() / scale_y_reordered()
# sort the bars inside each panel, and the axis labels follow the aesthetics:
# freq is mapped to x and lemma to y (the labels were previously swapped).
ggplot(word_freq, aes(x = freq, y = reorder_within(lemma, freq, Country_Name), fill = lemma)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~ Country_Name, scales = "free_y") +
  labs(
    title = "Ten Most Frequent Words in Country Statements",
    x = "Frequency",
    y = "Word"
  ) +
  theme_minimal() +
  guides(fill = "none") # Remove legend (fill = FALSE is deprecated)
# Normalise the Cleaned_Text column of merged_df: lower-case it, then strip
# punctuation, numbers, English stop words and surplus whitespace, in that
# order. The result stays grouped by Country_Name.
mrged_df <- merged_df %>%
  group_by(Country_Name) %>%
  mutate(
    Cleaned_Text = Cleaned_Text %>%
      tolower() %>%
      removePunctuation() %>%
      removeNumbers() %>%
      removeWords(stopwords("english")) %>%
      stripWhitespace()
  )
# mrged_df holds one row per annotated token, so the same Cleaned_Text is
# repeated on every token row of a document. Reduce it to one row per document
# first; otherwise each speech is counted once per token and the frequencies
# below are inflated in favour of the longer speeches.
doc_texts <- mrged_df %>%
  ungroup() %>%
  distinct(doc_id, Cleaned_Text)
# Create a document-term matrix with one row per speech
dtm <- DocumentTermMatrix(Corpus(VectorSource(doc_texts$Cleaned_Text)))
# Convert the document-term matrix to a data frame
dtm_df <- as.data.frame(as.matrix(dtm))
# Total frequency of every term across all speeches
word_frequencies <- colSums(dtm_df)
# Select the top 20 most frequent words
top_words <- head(sort(word_frequencies, decreasing = TRUE), 20)
# Create a data frame for plotting
plot_data <- data.frame(word = names(top_words), freq = top_words)
# Horizontal bar chart of the 20 words. The former facet_wrap(~"") only added
# an empty strip above the single panel, so it is dropped.
ggplot(plot_data, aes(x = fct_reorder(word, freq), y = freq, fill = word)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words as a whole",
    x = "Word",
    y = "Frequency"
  ) +
  theme_minimal() +
  guides(fill = "none") # Remove legend (fill = FALSE is deprecated)
Upon analyzing the top 20 most frequent words, it becomes evident that the words “nations,” “global,” “development,” “word,” “peace,” “president,” “united,” and “climate” hold the highest frequency. However, the presence of these terms in isolation provides minimal assistance or guidance since they lack contextual information.
# Word cloud of lemma frequencies over all speeches, limited to lemmas of at
# most 10 characters and to 200 words.
lemma_freq <- count(merged_df, lemma, sort = TRUE)
lemma_freq <- filter(lemma_freq, nchar(lemma) <= 10)
wordcloud(
  words = lemma_freq$lemma,
  freq = lemma_freq$n,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
To summarize, the word-frequency statistics in the initial part yielded significant findings regarding the predominant usage of words in individual speeches. Nevertheless, it became apparent that specific terms acquired value only when used in conjunction with others. To tackle this issue, the subsequent phase of the study placed emphasis on the identification and extraction of significant keyword combinations, recognizing the significance of contextual factors in facilitating a thorough comprehension of the claims. This methodology enriches the comprehensiveness of our analysis by encompassing subtle nuances that may not be readily discernible through the sole utilization of isolated word frequency metrics.
Frequency statistics of words are revealing, but one may find words which only make sense in combination with other words. Hence the goal of finding and extracting keywords which are a combination of words. The udpipe R package provides three methods to identify keywords in text:
- RAKE (Rapid Automatic Keyword Extraction)
- Collocation ordering using Pointwise Mutual Information
- Parts of Speech phrase sequence detection
Both RAKE and PoS techniques are used to generate rankings of common keywords across all combined speeches. Using different algorithms for the same purpose is a useful way of testing whether different models perform in an expected, comparable way:
# RAKE keyword extraction on the lemma column, grouped by Country_Code and
# restricted to tokens tagged as nouns or adjectives.
noun_or_adj <- mrged_df$upos %in% c("NOUN", "ADJ")
stats <- keywords_rake(
  x = mrged_df,
  term = "lemma",
  group = "Country_Code",
  relevant = noun_or_adj
)
# Factor version of the keyword whose levels are the keywords in reverse row
# order; it is the category variable of the chart.
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Chart the first 20 keywords that occur more than three times.
top_rake <- head(subset(stats, freq > 3), 20)
barchart(
  key ~ rake,
  data = top_rake,
  col = "cadetblue",
  main = "Keywords identified by RAKE",
  xlab = "Rake"
)
The RAKE algorithm was employed to identify the prevailing keywords, which encompassed terms such as “lady” and “gentlemen,” “United Nations,” “climate change,” “sustainable development,” “international law,” “global community,” and “climate summit.”
# Lower-cased copy of each token, used as the collocation term.
mrged_df[["word"]] <- tolower(mrged_df[["token"]])
# Collocation statistics on the lower-cased tokens, grouped by doc_id.
stats <- keywords_collocation(x = mrged_df, term = "word", group = "doc_id")
# Factor version of the keyword whose levels are the keywords in reverse row
# order; it is the category variable of the chart.
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Chart the PMI of the first 20 collocations that occur more than three times.
top_pmi <- head(subset(stats, freq > 3), 20)
barchart(
  key ~ pmi,
  data = top_pmi,
  col = "cadetblue",
  main = "Keywords identified by PMI Collocation",
  xlab = "PMI (Pointwise Mutual Information)"
)
Utilizing the PMI (Pointwise Mutual Information) framework, the prevailing keywords identified encompass the United Nations, the President, sustainable development, the General Assembly, climate change, the Security Council, sustainable development goals, development goals, and peace security.
# Recode the UPOS tags into phrasemachine tags and store them on mrged_df.
mrged_df$phrase_tag <- as_phrasemachine(mrged_df$upos, type = "upos")
# Find token sequences whose phrase tags match the regular expression below.
lower_tokens <- tolower(mrged_df$token)
stats <- keywords_phrases(
  x = mrged_df$phrase_tag,
  term = lower_tokens,
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)
# Keep multi-word phrases that occur more than three times.
stats <- subset(stats, ngram > 1 & freq > 3)
# Factor version of the keyword whose levels are the keywords in reverse row
# order; it is the category variable of the chart.
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Chart the frequency of the first 20 phrases.
top_phrases <- head(stats, 20)
barchart(
  key ~ freq,
  data = top_phrases,
  col = "cadetblue",
  main = "Keywords - simple noun phrases",
  xlab = "Frequency"
)
The results obtained with the Part-of-Speech (PoS) method exhibit similarities with those obtained through the Pointwise Mutual Information (PMI) method. The most frequently occurring terms identified are “United Nations,” “Mr. President,” “sustainable development,” “General Assembly,” “climate change,” “Security Council,” “sustainable development goals,” “development goals,” and “peace security.”
An n-gram refers to a consecutive sequence of n words extracted from a given text. For instance, a bigram is a combination of two words, where the value of n is equal to 2. This analysis provides an initial examination of the occurrence frequencies of the most commonly observed bigrams (n=2) and trigrams (n=3).
# Co-occurrence counts of noun and adjective lemmas, computed within each
# doc_id / paragraph_id / sentence_id group.
noun_adj_tokens <- subset(mrged_df, upos %in% c("NOUN", "ADJ"))
cooc <- cooccurrence(
  x = noun_adj_tokens,
  term = "lemma",
  group = c("doc_id", "paragraph_id", "sentence_id")
)
library(ggraph)
library(ggplot2)
# Build a graph from the first 30 co-occurrence rows.
wordnetwork <- cooc %>%
  head(30) %>%
  graph_from_data_frame()
# Draw the network; edge width and transparency both follow the cooc count.
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "pink") +
  geom_node_text(aes(label = name), col = "darkgreen", size = 4) +
  theme_graph(base_family = "Arial Narrow") +
  theme(legend.position = "none") +
  labs(
    title = "Cooccurrences within sentence",
    subtitle = "Nouns & Adjective"
  )
# Co-occurrence counts over the lemma sequence with skipgram = 1, keeping only
# tokens tagged as nouns or adjectives.
noun_or_adj <- mrged_df$upos %in% c("NOUN", "ADJ")
cooc <- cooccurrence(mrged_df$lemma, relevant = noun_or_adj, skipgram = 1)
library(ggraph)
library(ggplot2)
# Build a graph from the first 60 co-occurrence rows.
wordnetwork <- cooc %>%
  head(60) %>%
  graph_from_data_frame()
# Draw the network; edge width and transparency both follow the cooc count.
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc)) +
  geom_node_text(aes(label = name), col = "darkgreen", size = 4) +
  theme_graph(base_family = "Arial Narrow") +
  labs(
    title = "Words following one another",
    subtitle = "Nouns & Adjective"
  )
In contrast to examining individual characteristic words, the analysis now shifts focus to characteristic n-grams per document. This approach visualizes combinations of words that are most representative of each country’s statement. The resulting network graph illustrates common co-occurrences, reaffirming findings from previous analyses (RAKE, PMI, PoS). Notably, terms like ‘President,’ ‘Sustainable development,’ ‘Progress,’ ‘Prosperity,’ ‘Peace,’ ‘The government,’ ‘Global solidarity,’ and variations of ‘United Nation’ consistently emerge. This aligns with the earlier frequency analysis, highlighting the persistence of key thematic elements across different analytical perspectives.
From this analysis we can tell the sentiment of each speech as delivered by each country.
# Add a Country_Name column to db_bing by looking up each Country_Code in
# country_mapping. as.character() keeps the lookup name-based even if
# Country_Code is a factor, and unname() stops the codes from being carried
# along as element names. Codes missing from the mapping yield NA.
db_bing$Country_Name <- unname(country_mapping[as.character(db_bing$Country_Code)])
# Count of positive and negative words, one panel per country.
ggplot(db_bing, aes(x = n, y = fct_reorder(sentiment, n), fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = n), hjust = -0.2, size = 3, color = "black") + # Count label at the end of each bar
  facet_wrap(~ Country_Name, scales = "free_y") +
  labs(
    title = "Country speech sentiment",
    x = "Count",
    y = "Sentiment"
  ) +
  theme_minimal() +
  guides(fill = "none") # Remove legend (fill = FALSE is deprecated)
The figure above shows that the speeches, on average, expressed a favorable sentiment. Eritrea’s speech deserves particular attention, as it is the only speech expressing a negative viewpoint.